# load "Ask a A Manager 2021 Survey" googlesheet
# https://www.askamanager.org/
ask_a_manager_2021 <- read_csv(here::here("data","ask_a_manager.csv"))

skimr::skim(ask_a_manager_2021)
Data summary
Name ask_a_manager_2021
Number of rows 26765
Number of columns 18
_______________________
Column type frequency:
character 14
logical 2
numeric 1
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
how_old_are_you 0 1.00 5 10 0 7 0
industry 62 1.00 2 171 0 1084 0
job_title 0 1.00 1 126 0 12838 0
additional_context_on_job_title 19868 0.26 1 781 0 6612 0
currency 0 1.00 3 7 0 11 0
additional_context_on_income 23851 0.11 1 1143 0 2839 0
country 0 1.00 1 209 0 297 0
state 4761 0.82 4 114 0 125 0
city 23 1.00 1 171 0 4070 0
overall_years_of_professional_experience 0 1.00 9 16 0 8 0
years_of_experience_in_field 0 1.00 9 16 0 8 0
highest_level_of_education_completed 202 0.99 3 34 0 6 0
gender 155 0.99 3 29 0 5 0
race 151 0.99 5 125 0 47 0

Variable type: logical

skim_variable n_missing complete_rate mean count
other_monetary_comp 26765 0 NaN :
currency_other 26765 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
annual_salary 0 1 144988 5488158 0 54000 75712 110000 8.7e+08 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
timestamp 0 1 2021-04-27 11:02:09 2021-09-14 21:55:44 2021-04-28 12:35:21 23989

Data cleaning

After skimming the dataset, we took the below steps to clean the raw data :

Removing NA values

# remove the NA values
ask_a_manager_2021 <- ask_a_manager_2021 %>% 
  filter(!gender %in% c("NA","Other or prefer not to answer","Prefer not to answer", "Non-binary", NA) &
           race!="NA" &
           highest_level_of_education_completed!="NA")

Removing outliers

# using z-score. remove data out of 3 sigma
ask_a_manager_2021 <- ask_a_manager_2021[-which(abs(scale(ask_a_manager_2021$annual_salary))>3),]
# using quantile. remove data larger than 75%+1.5*IQR or smaller than 25%-1.5*IQR
# Q <- quantile(ask_a_manager_2021$salary, probs=c(.25, .75))
# iqr <- IQR(ask_a_manager_2021$salary)
# up <-  Q[2]+1.5*iqr # Upper Range  
# up <-  Q[1]-1.5*iqr # Upper Range  
# ask_a_manager_2021 <- ask_a_manager_2021 %>% 
#   filter(salary>=low & salary<=up)

Transforming into ordered factor datatype

Namely, we want to make age, years of working experience, education into factor, and take the log of salary for the next step of EDA.

ask_a_manager_2021 <- ask_a_manager_2021 %>% 
  mutate(
    country = countrycode::countryname(country),
    age_group = factor(how_old_are_you,levels=c("under 18","18-24","25-34","35-44","45-54","55-64","65 or over")),
    field_exp = factor(years_of_experience_in_field,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
    pro_exp = factor(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
education = factor(highest_level_of_education_completed, levels=c("High School","Some college","College degree","Master's degree","Professional degree (MD, JD, etc.)", "PhD")),
     higher_edu = ifelse(education %in% c("College degree","Master's degree", "PhD", "Professional degree (MD, JD, etc.)"),1,0),
    salary = annual_salary,
    log_salary = log(annual_salary)
  ) %>% 
  janitor::clean_names() # clean columns names

Selecting industries

Since there are over 1000 reported industries in the dataset, we will only keep the top 25 industries which account for 92% of total entries

# Industry is messy... it has > 1000 different industries.
# we only keep the names of top 25 industries. (24597(92%) out of 26765 entries)
industry_list <- ask_a_manager_2021 %>% 
  count(industry, sort=TRUE) %>% 
  mutate(percent = 100* n/sum(n)) %>% 
  slice_max(order_by = n, n = 25) %>% 
  select(industry)

ask_a_manager_2021 <- ask_a_manager_2021 %>% 
  mutate(industry = ifelse(industry %in% industry_list$industry,
                           industry,
                           "Others"))

Unifying currencies

Since there are 11 different currencies reported in the dataset, we want to unify the most common currencies into USD for the convenience of comparison analysis afterwards. We decided to convert all currencies with occurences greater than 100 into USD. This included CAD,GBP,EUR,AUD/NZD. Other was not converted as there wasn’t specific information on the currency.

ask_a_manager_2021 %>%
  count(currency, sort=TRUE) %>%
  mutate(percent = 100* n/sum(n))
currencynpercent
USD2118883.6   
CAD14925.88  
GBP14425.69  
EUR5582.2   
AUD/NZD4561.8   
Other1230.485 
CHF320.126 
SEK320.126 
JPY170.067 
ZAR130.0513
HKD40.0158

From the above table, We can see that USD, CAD, GBP, EUR, (AUD/NZD) are the most common currencies. We will convert the currencies all to USD using exchange rate data from library quantmod, and the exchange rate will be taken from the last date of the survey “2021-09-14”

library(quantmod)
#library to get currency data (and other financial data)

#gets the currency rate for the currency pairs. This uses the Oanda which keeps currency data for the last 180 days.
#In 180 days from "2021-09-14" the code will no longer return the currency rates.
currencies <-getSymbols(c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
                        from="2021-09-14", to="2021-09-14", src="oanda")

#In order to preserve the analysis for after 180 days we can hard code the currency rates into a tibble
currencies_preserved <- tibble(
  name = c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
  value = c(1.18,0.79,1.38,0.734,0.711)
)

Here we create a new column called converted_salary which converts the 5 most common currencies in USD. These are then rounded to the nearest whole number.

#the values are taken from the currencies_preserved$value as to preserve the analysis when 
# getSymbols will no longer work. The index matches the currency pair.
ask_a_manager_2021 <- ask_a_manager_2021 %>% 
  mutate(converted_salary = case_when(
    currency == "GBP" ~ round(annual_salary*currencies_preserved$value[3]),
    currency == "AUD/NZD" & country == "Australia" ~ round(annual_salary*currencies_preserved$value[4]),
    currency == "AUD/NZD" & country == "New Zealand" ~ round(annual_salary*currencies_preserved$value[5]),
    currency == "CAD" ~ round(annual_salary*currencies_preserved$value[2]),
    currency == "EUR" ~ round(annual_salary*currencies_preserved$value[1]),
    TRUE ~ round(annual_salary)
  ))
#Here we filter the dataset to only include the salaries that were converted into USD terms. The other currencies also included lots of outliers which we also get rid of here.

ask_a_manager_2021 <- ask_a_manager_2021 %>% 
  filter(currency %in% c("USD","GBP","AUD/NZD","EUR","CAD") &
           !is.na(country))

ask_a_manager_2021 %>%
  count(currency, sort=TRUE) %>%
  mutate(percent = 100* n/sum(n))
currencynpercent
USD2104284.5 
CAD14825.95
GBP13705.5 
EUR5542.22
AUD/NZD4511.81

Exploratory Data Analysis

Counts

# Some quick counts, groups, etc
ask_a_manager_2021 %>% 
  count(age_group, sort=TRUE) %>% 
  mutate(percent = 100* n/sum(n)) %>% 
  ggplot(aes(x=fct_relevel(age_group,levels=c("under 18","18-24","25-34","35-44","45-54","55-64","65 or over")), y=n)) +
  geom_bar(stat="identity") + labs(title = "Age distribution of Ask a Manager survey respondents",                                                 x="Age group",
                  y="Frequency",                                                  caption = "Source:Askamanager.com")

# 'country' 
ask_a_manager_2021 %>%
  count(country, sort=TRUE) %>% 
  mutate(percent = 100* n/sum(n))

# 'city' 
ask_a_manager_2021 %>% 
  count(city, sort=TRUE) %>% 
  mutate(percent = 100* n/sum(n))

# education 
ask_a_manager_2021 %>% 
  count(education) %>% 
  mutate(percent = 100* n/sum(n))%>% 
  arrange(desc(percent))

# gender
ask_a_manager_2021 %>% 
  count(gender) %>% 
  mutate(percent = 100* n/sum(n))%>% 
  arrange(desc(percent))

# race
ask_a_manager_2021 %>% 
  count(race) %>% 
  mutate(percent = 100* n/sum(n))%>% 
  arrange(desc(percent))

# pro_exp 
ask_a_manager_2021 %>% 
  count(pro_exp ) %>% 
  mutate(percent = 100* n/sum(n))%>% 
  arrange(desc(percent))

# field_exp  
ask_a_manager_2021 %>% 
  count(field_exp  ) %>% 
  mutate(percent = 100* n/sum(n))%>% 
  arrange(desc(percent))

#make comments about the counts 

Salary

How is salary distributed?

favstats(ask_a_manager_2021$salary) # find a really rich guy here
minQ1medianQ3maxmeansdnmissing
05.5e+047.6e+041.09e+055e+068.93e+047.07e+04248990
# density
g1 <- ggplot(ask_a_manager_2021, aes(x=salary))+
  geom_density()+
  NULL
#cdf
g2 <- ask_a_manager_2021 %>% 
  slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # choose a number
  ggplot(aes(x=salary))+
  stat_ecdf()+
  NULL

# taking log is better
g3 <- ggplot(ask_a_manager_2021, aes(x=log(salary)))+
  geom_density()+
  NULL

g4 <- ask_a_manager_2021 %>% 
  slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # choose a number
  ggplot(aes(x=log(salary)))+
  stat_ecdf()+
  NULL

gridExtra::grid.arrange(g1,g2,g3,g4)

Observation: This is a right skewed distribution.

Industry

ask_a_manager_2021 %>% 
  count(industry, sort=TRUE) %>% 
  ggplot(aes(y=fct_reorder(industry,n), x=n))+
  geom_col()

Observations:

  • Computing & tech is the top-chosen industry among respondents, followed by Education and Non-profit
  • This indicate that the sample might include more respondents from industries have higher exposure to tech and internet.

Salary ,Gender and Industry

We are interested in the type of industries with top salary for man and woman.

# Industries of top average salary for Man and Woman
salary_table <- ask_a_manager_2021 %>% 
  group_by(industry, gender) %>% 
  summarise(count = n(),
            avg_salary = mean(converted_salary),
            higher_edu_prop = sum(higher_edu)/n()) %>% 
  pivot_wider(id_cols = 1:5,
              names_from = gender,
              values_from = count:higher_edu_prop)
  
g_man <- salary_table %>% 
  ggplot(aes(y=fct_reorder(industry,avg_salary_Man), x=avg_salary_Man,fill=higher_edu_prop_Man))+
  geom_bar(position="dodge", stat="identity")+
  geom_text(aes(label=paste(avg_salary_Man%/%1000,"k")),
            position=position_dodge(width=0.9), hjust=0,vjust=0)+
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low="sky blue", high="blue")+
  labs(
    title="Man",
    x="Annual Salary $",
    y="Industry",
    fill="Higher Education %"
  )+
  theme_wsj()+
  theme(legend.position="bottom",
        legend.title=element_text(size=13))+
  NULL

# ggsave("highest_paid_industries_male.png",width = 12,height = 8,limitsize = F)

g_woman <- salary_table %>% 
  ggplot(aes(y=fct_reorder(industry,avg_salary_Woman), x=avg_salary_Woman,fill=higher_edu_prop_Woman))+
  geom_bar( position="dodge", stat="identity")+
  geom_text(aes(label=paste(avg_salary_Woman%/%1000,"k")),
              position=position_dodge(width=0.9), hjust=0,vjust=0)+
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low="pink", high="red")+
  labs(
    title="Woman",
    x="Annual Salary $",
    y="Industry",
    fill="Higher Education %"
  )+
  theme_wsj()+
  theme(legend.position="bottom",
        legend.title=element_text(size=13))+
  NULL

# ggsave("highest_paid_industries_female.png",width = 12,height = 8,limitsize = F)

Observations:

  • For both man and woman, the top earning industries are alike (Law, Computing or Tech, Business or Consulting)
  • Respondents from Law, Govenment and Education tends to have higher education level compared to the rest, reflecting the academic/professional requirements of these positions
  • The difference between the salary within sales and accounting, banking & finance is likely due to the subdivisions chosen based on gender nature

We are also interested in the salary difference between man and woman within the same industry.

# salary gap in different industries
ask_a_manager_2021 %>% 
  group_by(industry, gender) %>% 
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>% 
  pivot_wider(id_cols = 1:4,
              names_from = gender,
              values_from = count:avg_salary) %>% 
  mutate(size = count_Man+count_Woman,
         male_prop = count_Man/(count_Man+count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>% 
  ggplot(aes(y=fct_reorder(industry,salary_gap), x=salary_gap,fill=male_prop))+
  geom_col()+
  
  # geom_point(aes(y=fct_reorder(industry,size),x=size*10))+
  
  scale_x_continuous(
    breaks = seq(-20000,40000,5000)
  ) +
  scale_fill_gradient(low="pink", high="blue")+
  labs(
    title="Salary Gap and Sex Ratio",
    x="Salary Gap in $",
    y="",
    fill="Male %"
  )+
  theme_wsj()+
  theme(legend.position="bottom",
        legend.title=element_text(size=13))+
  NULL

# ggsave("salary_gap.png",width = 12,height = 8,limitsize = F)

Observations:

  • Man has a higher average salary than woman in most of the industries, with Law being the most significant
  • Woman has higher average salary in Art & Design
  • The proportion of male within computing or tech is significantly higher than that within other industries; yet the proportion only reaches c. 40%, reflecting the fact that most respondents are female because Ask_a_manager is more popular among women
  • The proportion of male is also higher in labour-intensive industries such as transport or logistics, engineering, utilities & telecommunications

Education

Next, we want to see if there is a difference in the salary gap between man and woman for different education level.

ask_a_manager_2021 %>% 
  group_by(education, gender) %>% 
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>% 
  pivot_wider(id_cols = 1:4,
              names_from = gender,
              values_from = count:avg_salary) %>% 
  mutate(size = count_Man+count_Woman,
         male_prop = count_Man/(count_Man+count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>% 
  ggplot(aes(x=education, y=salary_gap,fill=male_prop))+
  geom_col()+
  scale_y_continuous(
    breaks = seq(-20000,40000,5000)
  ) +
  scale_fill_gradient(low="pink", high="sky blue")+
  theme_wsj()+
  theme(legend.position="bottom", axis.title=element_text(size=24))+
  ggtitle("Salary Gap at Different Education level") +
  xlab("Education Level") + ylab ("Salary Gap in $") + 
  NULL

# ggsave("salary_gap_by_edu.png",width = 14,height = 8,limitsize = F)

Race

Count

ask_a_manager_2021 %>% 
  group_by(race) %>% 
  filter(n() > 500) %>% 
  summarise(count=n()) %>% 
  ggplot(aes(y=count, x=race, fill=race))+
  geom_col() +
  theme_wsj()+
  theme(legend.position = "none") +
  labs(x="Race", y="Count", title="Count by Race", subtitle = "Races with > 500 Respondents")+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) + 
  NULL

Race and Mean Salary

ask_a_manager_2021 %>% 
  filter(industry %in% industry_list$industry) %>% 
  group_by(race) %>% 
  filter(n() > 500) %>% 
 ggplot( aes(x=race, y=converted_salary, color=race)) +
    geom_jitter(color="black", size=0.01, alpha=0.9) +
   geom_boxplot(alpha=0.4) +
   scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    geom_hline(yintercept=144988, color="red") +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    theme_wsj() +
    theme(legend.position = "none")+
    theme(axis.title=element_text(size=10))+
    ggtitle("Salary Vs. Race Boxplot") +
    ylab("Salary") + xlab ("Race") +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
    scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
    NULL

Observation:

  • In our boxplot one can visualize both the distribution of the data and the medium and interquartile ranges of the data. When comparing annual salary by race, our data shows that Asians have the higher salaries in comparison to Black and African American and White sample respondents. Interestingly enough, our sample also shows that the medium salary for Black and AA respondents is higher than that of White respondents.

  • Since most repondents were white, the larger sample size is less prone to skewness due to outliers when comparing it to the other two races portrayed in the plot.

ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  group_by(race) %>% 
  filter(race %in% c("Black or African American", "White")) 

t.test(salary ~ race, data = ask_a_manager_2021_race)
## 
##  Welch Two Sample t-test
## 
## data:  salary by race
## t = 1, df = 605, p-value = 0.3
## alternative hypothesis: true difference in means between group Black or African American and group White is not equal to 0
## 95 percent confidence interval:
##  -6923 26250
## sample estimates:
## mean in group Black or African American                     mean in group White 
##                                   97570                                   87906

When running a T-test on the two racial groups, White and Black or African American, the P-value we get is 0.9. Thus there is no statistically signifcant difference in their annual salary when calculating it based of off this data set.

Race and Mean Salary

Location

#This package is useful for creating a map of the USA
library(usmap)
library(ggrepel)

#create df of the list of states with >10 occurrences 
#This allows us to get data for actual states as many data entry errors
#many individuals put multiple states and this helps to fix that
states <- ask_a_manager_2021 %>% 
  group_by(state) %>%
  filter(!is.na(state)) %>%
  filter(n()>10) %>% 
  summarise(mean_salary = mean(salary),
            count_state = count(state))

states$fips <- fips(states$state)
statesalarymap <- states %>% select(state,mean_salary)

plot_usmap(data=statesalarymap, values="mean_salary", color="black") + scale_fill_continuous(
    low = "white",
    high = "light green",
    name = "Mean Salary") +
  theme(legend.position = "right") +
  labs(title="Map of US States based on Average Salary")

MAP

# Count by Industry

ask_a_manager_2021 %>% 
  group_by(industry) %>% 
  summarise(count=n()) %>% 
  ggplot(aes(y=fct_reorder(industry,count),x=count))+
  geom_col()

# null hypothesis: no difference between

Age

ask_a_manager_2021 %>% 
  group_by(age_group, gender) %>% 
  summarise(count=n()) 
## # A tibble: 13 x 3
## # Groups:   age_group [7]
##    age_group  gender count
##    <fct>      <chr>  <int>
##  1 under 18   Woman      6
##  2 18-24      Man      168
##  3 18-24      Woman    759
##  4 25-34      Man     1853
##  5 25-34      Woman   9155
##  6 35-44      Man     1907
##  7 35-44      Woman   7110
##  8 45-54      Man      616
##  9 45-54      Woman   2327
## 10 55-64      Man      156
## 11 55-64      Woman    754
## 12 65 or over Man       17
## 13 65 or over Woman     71
  #ggplot(aes(y=fct_reorder(how_old_are_you,count),x=count))+
  #geom_col()
# Count by Age Group
ask_a_manager_2021 %>% 
  group_by(age_group) %>% 
  summarise(count=n()) %>% 
  ggplot(aes(y=fct_reorder(age_group,count),x=count))+
  geom_col()

# Sorting out the people who are 18+ 
ask_a_manager_age <- ask_a_manager_2021 %>%  
                    mutate(factor(age_group,levels=c("18-24","25-34","35-44","45-54","55-64","65 or over")))
#need to change colors for male and female ?!
#there are only 6 women in under 18 category so excluding them for the graph
# since there is no male data in that category to compare
ask_a_manager_age %>% 
  filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64"  )) %>% 
  group_by(age_group, gender) %>% 
  summarise (median_salary_by_age = median (salary),
             mean_salary_by_age = mean(salary)) %>% 
  ggplot () +
  geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs(  title =  "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
  NULL

Changed Industry vs salary

ask_a_manager_2021_changed <- ask_a_manager_2021  %>%
mutate(if_changed= if_else(overall_years_of_professional_experience == years_of_experience_in_field, "No", "Yes"))
ask_a_manager_2021_changed %>% 
ggplot(aes(x=fct_relevel(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")), y=annual_salary, color=if_changed))+
  # scatter plot
  geom_point(alpha = 0.5)+
  # linear smooth line
  geom_smooth(method = "lm")+
  # legend settings
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  labs(title="Whether changing job affects your salary",
       x="Total Years Spent Working",
       y="Salary")+
  theme_bw()+
  NULL

ask_a_manager_2021_changed %>% 
  group_by(overall_years_of_professional_experience, if_changed) %>% 
  summarise (median_salary_over_time = median (salary),
             mean_salary_over_time = mean(salary)) %>% 
  ggplot () +
  geom_col(aes (fct_relevel(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")), median_salary_over_time, fill=if_changed), position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = overall_years_of_professional_experience, y=mean_salary_over_time, color = if_changed))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs(  title =  "Median Salary over time" , y="Salary" ,x = "Prof") +
  NULL

ggsave("switch_industry.png",width = 12,height = 8,limitsize = F)

Statistical Tests

Gender vs Salary

General Statistics & Boxplot

mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021)
genderminQ1medianQ3maxmeansdnmissing
Man06.5e+049.6e+041.41e+052.11e+061.12e+058.33e+0447170
Woman05.3e+047.2e+041e+05       5e+06       8.38e+046.63e+04201820
ask_a_manager_2021 %>% 
  ggplot( aes(x=gender, y=converted_salary, color=gender)) +
    geom_jitter(color="black", size=0.4, alpha=0.9) +
    geom_boxplot(alpha=0.3) +
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    theme_wsj() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
   theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    theme(axis.title=element_text(size=12))+
    ggtitle("Salary Vs. Gender Boxplot with Jitter") +
    ylab("Salary") + xlab ("Race") + 
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
    scale_y_continuous(labels = scales::comma, limits = c(0, 1000000)) +
    NULL

For our following analysis, we will be comparing data from the male gender “Man” with data from the female gender “Woman.” We will filter out the data on those that are non binary.

T-Test

ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(gender!="Non-binary")
t.test(converted_salary ~ gender, data = ask_a_manager_2021)
## 
##  Welch Two Sample t-test
## 
## data:  converted_salary by gender
## t = 21, df = 6183, p-value <2e-16
## alternative hypothesis: true difference in means between group Man and group Woman is not equal to 0
## 95 percent confidence interval:
##  25223 30318
## sample estimates:
##   mean in group Man mean in group Woman 
##              111551               83781

Confidence Intervals

#mosaic::favstats (annual_salary ~ gender, data=ask_a_manager_2021)
ask_a_manager_2021_gender <- ask_a_manager_2021 %>%
  mutate(gender2 = case_when(
  gender  == "Man" ~ "1",
 gender == "Woman" ~ "0",
  TRUE ~ "NA"
)) %>%
  mutate(gender2 = as.numeric(gender2))


mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021_gender) %>%
mutate(t_critical = qt(0.975,n-1),
         sd_mean = sd/sqrt(n),
         margin_of_error = t_critical*sd_mean,
         ci_lower = mean-margin_of_error,
         ci_higher = mean+margin_of_error)
genderminQ1medianQ3maxmeansdnmissingt_criticalsd_meanmargin_of_errorci_lowerci_higher
Man06.5e+049.6e+041.41e+052.11e+061.12e+058.33e+04471701.961.21e+032.38e+031.09e+051.14e+05
Woman05.3e+047.2e+041e+05       5e+06       8.38e+046.63e+042018201.96466       914       8.29e+048.47e+04
# Add graph of two distributions and confidence intervals 

Observation:

  • Since two 95% confidence intervals do not overlap, we are at least 95% confident that the male have higher salary than the female on average.

Bootstrap Test

# hypothesis testing using infer package
diff <- ask_a_manager_2021_gender %>%
  specify(converted_salary ~ gender) %>%
  calculate(stat = "diff in means", order = c("Woman", "Man"))

set.seed(1234)
salary_in_null_world <- ask_a_manager_2021_gender %>%
  filter(gender !="Non-binary") %>%
  specify(converted_salary ~ gender) %>%
   # assuming independence of gender and salary
  hypothesize(null = "independence") %>%
   # generate 1000 reps, of type "permute"
  generate(reps = 1000, type = "permute") %>%
  # calculate statistic of difference, namely "diff in means"
   calculate(stat = "diff in means", order = c("Woman", "Man")) 

salary_in_null_world %>% 
  visualise()+
  shade_p_value(obs_stat = diff, direction = "two-sided")

p_value <- salary_in_null_world %>% 
  get_pvalue(obs_stat = diff, direction="both")
p_value
p_value
0

Observation:

  • Our null hypothesis is that the difference of annual salaries between male and female is 0 and we get a p_value very close to 0. Therefore we reject the null hypothesis with 95% confidence. There is difference between the average male and female salary.

Gender vs Education

We want to look at the relationship between one’s gender and education degree. In essence, we want to explore if there is a difference in the proportion of respondents with high school degree as their highest degree for male group and the female group. We can use the proportion test to generate the result from the male and female sample.

ask_a_manager_2021_education <- ask_a_manager_2021 %>% 
   filter(gender !="Non-binary") %>%
   mutate(below_college = if_else(highest_level_of_education_completed %in% c("High School", "Some college"), "Yes", "No")) %>% 
   group_by(gender) %>% 
   summarise(college_below=sum(below_college == "Yes"), college_or_above = sum(below_college == "No"))

ask_a_manager_2021_education
gendercollege_belowcollege_or_above
Man6874030
Woman159818584
prop.test(x = c(4030,18584), n = c(4030+687,18584+1598), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c4030 out of 4030 + 68718584 out of 18584 + 1598
## X-squared = 202, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0773 -0.0556
## sample estimates:
## prop 1 prop 2 
##  0.854  0.921

Observations:

  • From the result of proportion test for male group and female group, we are 99% confident that the proportion of high school as highest education level for male is higher than that for female, indicating that overall female has a higher college-entry rate vs. the male.

If we repeat the process to explore the difference in proportion in acquiring a masters or above degree for both gender:

ask_a_manager_2021_education <- ask_a_manager_2021 %>% 
  filter(gender !="Non-binary") %>%
  mutate(higher_edu = if_else(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"), "Yes", "No")) %>% 
  group_by(gender) %>% 
  summarise(master_below = sum(higher_edu == "No"), master_or_above=sum(higher_edu == "Yes") )


ask_a_manager_2021_education
gendermaster_belowmaster_or_above
Man30631654
Woman112608922
prop.test(x = c(1654, 8922), n = c(1654+3063, 8922+11260), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c1654 out of 1654 + 30638922 out of 8922 + 11260
## X-squared = 130, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1068 -0.0761
## sample estimates:
## prop 1 prop 2 
##  0.351  0.442

Observations:

  • From the result of two proportion tests, we can see that even though the proportion of acquiring a college-or-above degree is less in the male group, the proportion of acquiring higher education degree (Master’s-or-above) is higher than the female group
  • This somehow explains the fact that there are more outliers in male sample with extremely high income level

Race vs Salary

Here we want to see if race is affecting the salary.

ask_a_manager_salary <- ask_a_manager_2021 %>% 
  mutate(if_white = if_else(race == "White", "Yes", "No")) %>% 
  select(if_white,salary)

t.test(salary~if_white, data = ask_a_manager_salary, success = "Yes")
## 
##  Welch Two Sample t-test
## 
## data:  salary by if_white
## t = 5, df = 4313, p-value = 6e-07
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##   5472 12543
## sample estimates:
##  mean in group No mean in group Yes 
##             96913             87906

Race vs Education

We want to look at the relationship between one’s race and education degree. Particularly, we want to see if there is a difference in education level for those being white and those being non-white.

As the above analysis for gender vs. education, we run two proportion tests for college-or-above degree percentage and master-or-above percentage between the male and female group.

ask_a_manager_2021_race <- ask_a_manager_2021 %>% 
  mutate(if_white = if_else(race == "White", "White", "Non-white"),
         below_college = if_else(highest_level_of_education_completed %in% c( "High School", "Some college"), "Yes", "No")) %>% 
  group_by(if_white) %>% 
  summarise(college_below=sum(below_college == "Yes"), college_or_above = sum(below_college == "No"))

ask_a_manager_2021_race
if_whitecollege_belowcollege_or_above
Non-white3503510
White193519104
prop.test(x = c(3510,19104), n = c(3510+350, 19104+1935), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c3510 out of 3510 + 35019104 out of 19104 + 1935
## X-squared = 0.05, df = 1, p-value = 0.8
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.00872  0.01132
## sample estimates:
## prop 1 prop 2 
##  0.909  0.908

Similarly, if we run the test for the propotion of master_or_above education level between white and non-white races:

ask_a_manager_2021_race <- ask_a_manager_2021 %>% 
  mutate(if_white = if_else(race == "White", "White", "Non-white"),
         higher_edu = if_else(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"), "Yes", "No")) %>% 
  group_by(if_white) %>% 
  summarise(master_below = sum(higher_edu == "No"), master_or_above=sum(higher_edu == "Yes") )

ask_a_manager_2021_race
if_whitemaster_belowmaster_or_above
Non-white23471513
White119769063
prop.test(x = c(1513, 9063), n = c(1513+2347, 9063+11976), alternative = "two.sided")
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c out of c1513 out of 1513 + 23479063 out of 9063 + 11976
## X-squared = 20, df = 1, p-value = 8e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0557 -0.0219
## sample estimates:
## prop 1 prop 2 
##  0.392  0.431

Observations:

  • From the result of the proportion test, we cannot see a clear difference between the proportion of education level in white and non-white group, as the p-value is higher than 5%.

  • However, the proportion of acquiring higher education degree (Master’s-or-above) is higher in the white group vs. other races

Changed Industry vs salary

t.test(salary~if_changed, data = ask_a_manager_2021_changed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by if_changed
## t = 15, df = 24238, p-value <2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  11569 15074
## sample estimates:
##  mean in group No mean in group Yes 
##             95369             82047
prop.test(~if_changed, data = ask_a_manager_2021_changed, success = "Yes")
## 
##  1-sample proportions test with continuity correction
## 
## data:  ask_a_manager_2021_changed$if_changed  [with success = Yes]
## X-squared = 198, df = 1, p-value <2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.449 0.462
## sample estimates:
##     p 
## 0.455
ask_a_manager_age %>% 
  filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64"  )) %>% 
  group_by(age_group, gender) %>% 
  summarise (median_salary_by_age = median (salary),
             mean_salary_by_age = mean(salary)) %>% 
  ggplot () +
  geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs(  title =  "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
  NULL

Regression Analysis

colnames(ask_a_manager_2021)
reg_data <- ask_a_manager_2021 %>% 
  mutate(country = ifelse(country=="US","US","Non-US"))

m1 <- lm(converted_salary ~ education, data=reg_data)
m2 <- lm(converted_salary ~ education + age_group, data=reg_data)
m3 <- lm(converted_salary ~ education + gender, data=reg_data)
m4 <- lm(converted_salary ~ education + age_group + gender + pro_exp + field_exp, data=reg_data)
m5 <- lm(converted_salary ~ education + age_group + gender + field_exp, data=reg_data)

huxreg(m1, m2, m3, m4, m5,
       statistics = c('#observations' = 'nobs', 
                      'R squared' = 'r.squared', 
                      'Adj. R Squared' = 'adj.r.squared', 
                      'Residual SE' = 'sigma'), 
       bold_signif = 0.05, 
       stars = NULL
) %>% 
  set_caption('Comparison of models')
Comparison of models
(1)(2)(3)(4)(5)
(Intercept)73753.850 62856.643 92165.971 70905.720 65882.024 
(3031.177)(28236.014)(3065.946)(27781.746)(27527.453)
educationSome college1146.369 178.194 4490.817 4192.416 4395.104 
(3456.755)(3426.821)(3410.103)(3333.414)(3333.739)
educationCollege degree10936.411 14478.221 16607.004 20368.221 20329.478 
(3096.939)(3076.147)(3060.401)(3001.029)(2997.480)
educationMaster's degree16560.585 17400.650 23572.779 24328.428 24326.172 
(3128.923)(3106.455)(3095.672)(3036.162)(3030.978)
educationProfessional degree (MD, JD, etc.)60645.762 60842.097 68008.053 70035.175 69643.884 
(3625.164)(3597.836)(3584.357)(3518.621)(3508.744)
educationPhD30306.472 29312.578 36070.156 39374.432 38793.181 
(3597.624)(3568.901)(3553.194)(3498.655)(3478.807)
age_group18-24     -17669.678      -9879.743 -9267.739 
     (28271.961)     (27567.347)(27491.310)
age_group25-34     1111.921      -3876.079 -2008.487 
     (28188.442)     (27463.620)(27408.375)
age_group35-44     15983.686      -5471.914 -1500.703 
     (28187.758)     (27457.952)(27410.485)
age_group45-54     20370.007      -11070.008 -7739.091 
     (28206.456)     (27461.839)(27431.085)
age_group55-64     25471.862      -8051.878 -6100.110 
     (28269.960)     (27527.157)(27518.041)
age_group65 or over     17678.366      -14583.331 -14958.135 
     (29120.832)     (28447.729)(28432.793)
genderWoman          -30004.937 -26329.654 -26229.759 
          (1117.505)(1099.665)(1099.280)
pro_exp2 - 4 years               -7805.873      
               (4204.339)     
pro_exp5-7 years               -5300.576      
               (4351.911)     
pro_exp8 - 10 years               -1392.495      
               (4392.389)     
pro_exp11 - 20 years               337.324      
               (4472.274)     
pro_exp21 - 30 years               -1275.784      
               (4821.102)     
pro_exp31 - 40 years               -2767.468      
               (5781.403)     
pro_exp41 years or more               -12687.960      
               (9105.571)     
field_exp2 - 4 years               8411.615 6500.929 
               (2505.791)(2225.984)
field_exp5-7 years               17208.320 16989.085 
               (2596.075)(2275.486)
field_exp8 - 10 years               25127.188 26530.695 
               (2694.501)(2345.796)
field_exp11 - 20 years               36182.740 37489.263 
               (2725.230)(2381.517)
field_exp21 - 30 years               57087.061 57240.743 
               (3327.637)(2920.774)
field_exp31 - 40 years               51244.416 50426.815 
               (5212.230)(4654.659)
field_exp41 years or more               49007.992 42568.136 
               (12596.801)(11634.336)
#observations24899     24899     24899     24899     24899     
R squared0.028 0.047 0.056 0.101 0.100 
Adj. R Squared0.028 0.047 0.056 0.100 0.099 
Residual SE69651.152 68979.521 68665.310 67032.046 67051.976